library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(dendextend) # nice dendrograms
library(pheatmap) # nice heatmaps
library(caret) #another option for pca
# Load the built-in iris dataset and inspect its structure:
# 150 observations of four numeric measurements plus a 3-level species factor.
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Challenge
Take a few moments to explore the Iris dataset. What can you learn? Which species do you think will be easier to separate?
{: .source}
Solution
dim(iris)
str(iris)
iris %>% ggplot(aes(x = Sepal.Length, y = Sepal.Width, col = Species)) + geom_point() + theme_minimal()
iris %>% ggplot(aes(x = Petal.Width, y = Petal.Length, col = Species)) + geom_point() + theme_minimal()
iris %>% gather(key, value, -Species) %>% ggplot(aes(y = value, fill = Species)) + geom_boxplot() + facet_wrap(.~key) + theme_bw()
{: .output} {: .solution} {: .challenge}
What if we didn’t know we had 3 species? Could we use the morphological data alone to study this problem?
# Scale the four morphological variables to mean 0, sd 1 so that
# distance-based methods are not dominated by variables with larger units.
iris_scaled <- scale(iris[, 1:4])
# Label rows "<first two letters of species>_<row number>" (e.g. "se_1") so
# cluster output stays interpretable after the Species column is dropped.
# seq_along() replaces the redundant seq(1:length(...)) anti-pattern.
rownames(iris_scaled) <- paste(substr(iris$Species, 1, 2),
                               seq_along(iris$Species),
                               sep = "_")
# Pairwise sample distances (Euclidean by default), shown as an ordered heatmap.
distance <- get_dist(iris_scaled)
fviz_dist(distance)
# k-means with k = 2; nstart = 25 tries 25 random initialisations and keeps
# the solution with the lowest total within-cluster sum of squares.
k2 <- kmeans(iris_scaled, centers = 2, nstart = 25)
str(k2)
## List of 9
## $ cluster : Named int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
## $ centers : num [1:2, 1:4] -1.011 0.506 0.85 -0.425 -1.301 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:2] 47.4 173.5
## $ tot.withinss: num 221
## $ betweenss : num 375
## $ size : int [1:2] 50 100
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Visualise the k = 2 partition projected onto the first two principal components.
fviz_cluster(k2, data = iris_scaled) + theme_minimal()
# Fix the RNG seed so the k-means initialisations below are reproducible.
set.seed(42)
# Three heuristics for choosing k: total within-cluster sum of squares
# ("elbow"), average silhouette width, and the gap statistic.
fviz_nbclust(iris_scaled, kmeans, method = "wss")
fviz_nbclust(iris_scaled, kmeans, method = "silhouette")
fviz_nbclust(iris_scaled, kmeans, method = "gap_stat")
# Refit with k = 3 (the number of true species in the data).
k3 <- kmeans(iris_scaled, centers = 3, nstart = 25)
str(k3)
## List of 9
## $ cluster : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
## $ centers : num [1:3, 1:4] 1.1322 -1.0112 -0.0501 0.0881 0.8504 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 47.5 47.4 44.1
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 47 50 53
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Plot the k = 3 clustering; the clusters line up closely with the species.
fviz_cluster(k3, data = iris_scaled) + theme_bw()
# Deliberately over-cluster with k = 7: k-means will happily split the data
# into however many groups are requested, whether or not they are meaningful.
k7 <- kmeans(iris_scaled, centers = 7, nstart = 25)
str(k7)
## List of 9
## $ cluster : Named int [1:150] 2 1 1 1 2 2 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:150] "se_1" "se_2" "se_3" "se_4" ...
## $ centers : num [1:7, 1:4] -1.303 -0.719 0.289 0.954 0.36 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7] "1" "2" "3" "4" ...
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:7] 9.65 12.15 11.83 7.55 5.94 ...
## $ tot.withinss: num 71.1
## $ betweenss : num 525
## $ size : int [1:7] 25 25 29 21 17 21 12
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
fviz_cluster(k7, data = iris_scaled) + theme_bw()
Challenge
Choose whichever clustering approach you think worked best among the above. If you partition the data this way, which of the variables is most distinct in the clusters?
{: .source}
Solution
iris %>%
  mutate(Cluster = k3$cluster) %>%
  group_by(Cluster) %>%
  summarize(MostFreqSpecies = names(which.max(table(Species))),
            Sepal.Length = mean(Sepal.Length),
            Sepal.Width = mean(Sepal.Width),
            Petal.Width = mean(Petal.Width),
            Petal.Length = mean(Petal.Length))
{: .output} {: .solution} {: .challenge}
The first step is to compute the distance between each sample, and by default, the complete linkage method is used.
# Hierarchical clustering on the Euclidean distance matrix.
# Complete linkage is hclust()'s default agglomeration method.
iris_hcl <- hclust(dist(iris_scaled))
plot(iris_hcl)
# cut dendrogram in 3 clusters (use <- for assignment, per R convention)
dendcut <- cutree(iris_hcl, 3)
# Cross-tabulate cluster membership against the known species labels.
table(dendcut, iris$Species)
##
## dendcut setosa versicolor virginica
## 1 49 0 0
## 2 1 21 2
## 3 0 29 48
# Same tree drawn through the dendrogram S3 plot method.
iris_hcl %>% as.dendrogram() %>% plot()
# hcut() re-runs the hierarchical clustering and cuts at k = 3; fviz_dend
# draws the tree with coloured rectangles around the three clusters.
fviz_dend(hcut(iris_scaled, k = 3, hc_method = "complete"),
rect = TRUE,
cex = 0.5,
palette = "Set1"
)
# Row annotation for the heatmap: the two-letter species prefix that was
# encoded into the row names of the scaled matrix earlier.
collabels <- data.frame(species = substr(rownames(iris_scaled), 1, 2))
row.names(collabels) <- rownames(iris_scaled)
# Heatmap of the scaled data, reusing the precomputed row dendrogram.
pheatmap(iris_scaled,
cluster_rows = iris_hcl,
treeheight_row = 30,
treeheight_col = 30,
annotation_row = collabels)
Challenge
Try constructing a heatmap using another agglomeration method, and visualise the results.
Do you think your approach is better or worse than the “default”? Compare with your group…
{: .source}
Solution
iris_hcl <- hclust(dist(iris_scaled), method = "ward.D")
pheatmap(iris_scaled,
         cluster_rows = iris_hcl,
         treeheight_row = 30,
         treeheight_col = 30,
         annotation_row = collabels)
{: .output} {: .solution} {: .challenge}
# PCA on the scaled measurements (already centred/scaled above, so prcomp
# needs no further scaling arguments here).
iris_pca <- prcomp(iris_scaled)
# Scores: coordinates of each sample on the principal components.
# Use <- for assignment, per R convention.
iris_scores <- as.data.frame(iris_pca$x)
# Recover the two-letter species prefix from the row names for colouring.
iris_scores$species <- substr(row.names(iris_scores), 1, 2)
# plot of observations
ggplot(data = iris_scores, aes(x = PC1, y = PC2, color = species, label = species)) +
geom_text(alpha = 0.8, size = 4) +
ggtitle("First two PC of Iris data") +
theme_minimal()
# Scree plot: percentage of variance explained by each component.
fviz_eig(iris_pca, addlabels = TRUE)
# plot of observations
ggplot(data = iris_scores, aes(x = PC2, y = PC3, color = species, label = species)) +
geom_text(alpha = 0.8, size = 4) +
ggtitle("PC2/3 of Iris data") +
theme_minimal()
Let’s look at the rotation matrix:
# Loadings: how strongly each original variable contributes to each PC.
iris_pca$rotation
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863
## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
# Correlation circle: variables coloured by quality of representation (cos2).
fviz_pca_var(iris_pca,
col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
# Contributions of variables to PC1
fviz_contrib(iris_pca, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(iris_pca, choice = "var", axes = 2, top = 10)
# Visualize
# Use habillage to specify groups for coloring
fviz_pca_ind(iris_pca,
label = "none",
habillage = iris$Species,
palette = "Set1",
addEllipses = TRUE
)
# caret alternative: Box-Cox transform, centre, scale, then PCA in one step.
# preProcess() only learns the transformation; predict() applies it.
iris_trans <- preProcess(iris[,1:4], method=c("BoxCox", "center", "scale", "pca"))
iris_PCaret <- predict(iris_trans, iris[,1:4])
iris_PCaret # kept only the PCs that are necessary >= 95% of variability in the data
## PC1 PC2
## 1 -2.30353961 -0.474826012
## 2 -2.15130962 0.648290323
## 3 -2.46134124 0.346392126
## 4 -2.41396812 0.606683881
## 5 -2.43277736 -0.611005666
## 6 -1.97917155 -1.395059430
## 7 -2.50115378 -0.006518402
## 8 -2.28638088 -0.229199408
## 9 -2.48383388 1.159498745
## 10 -2.33581242 0.443987500
## 11 -2.17167287 -1.011051847
## 12 -2.40408015 -0.120269521
## 13 -2.38192257 0.714883104
## 14 -2.88538841 1.024170864
## 15 -2.16605616 -1.737980893
## 16 -2.09777970 -2.391303176
## 17 -2.11003673 -1.383989325
## 18 -2.17836253 -0.483061847
## 19 -1.83599672 -1.355959687
## 20 -2.31125904 -1.046088311
## 21 -1.93737228 -0.445315773
## 22 -2.14789585 -0.870075742
## 23 -2.87087622 -0.371786247
## 24 -1.75630224 -0.112582094
## 25 -2.30593126 -0.128572099
## 26 -2.01143673 0.587476488
## 27 -2.01927769 -0.247388061
## 28 -2.19927409 -0.530725435
## 29 -2.17458180 -0.333747858
## 30 -2.36319236 0.338089547
## 31 -2.22443371 0.487464454
## 32 -1.76841797 -0.455201848
## 33 -2.67622119 -1.610906247
## 34 -2.39807281 -1.942435053
## 35 -2.18117474 0.433813350
## 36 -2.26606711 0.179855760
## 37 -2.05803552 -0.678662754
## 38 -2.66185534 -0.545552734
## 39 -2.58060936 0.945557589
## 40 -2.21341474 -0.283383488
## 41 -2.28404497 -0.426110240
## 42 -1.88759440 2.516670631
## 43 -2.70437456 0.526866756
## 44 -1.88695330 -0.454081303
## 45 -2.07118404 -1.064343709
## 46 -2.10210781 0.696473119
## 47 -2.40371982 -1.040620003
## 48 -2.50786821 0.402470021
## 49 -2.24054721 -0.959906308
## 50 -2.26049417 -0.028181150
## 51 1.12507182 -0.903871861
## 52 0.79118659 -0.657201417
## 53 1.26040902 -0.667116963
## 54 0.55272346 1.839079983
## 55 1.13447975 0.155012571
## 56 0.49256530 0.525384382
## 57 0.79884457 -0.826102580
## 58 -0.38054459 1.933801527
## 59 0.99984229 -0.100525205
## 60 0.08880332 1.016640487
## 61 0.07447330 2.939105790
## 62 0.51707110 -0.007629525
## 63 0.74725565 1.876352767
## 64 0.80580689 0.108088184
## 65 0.06727687 0.376717161
## 66 0.92670853 -0.568734868
## 67 0.42293272 0.126858901
## 68 0.28955671 0.729883259
## 69 1.36043786 1.751163755
## 70 0.30507192 1.300403803
## 71 0.76560630 -0.454505340
## 72 0.57888769 0.353645374
## 73 1.32475341 0.933440428
## 74 0.74238439 0.338583304
## 75 0.78831001 -0.008025008
## 76 0.93388033 -0.315878681
## 77 1.30442768 0.030080677
## 78 1.36606600 -0.385504872
## 79 0.74120785 0.154788829
## 80 0.09823040 1.027682646
## 81 0.27891782 1.599274503
## 82 0.17594741 1.606664291
## 83 0.36211946 0.726339119
## 84 1.13052928 0.583859684
## 85 0.28892996 0.226368291
## 86 0.49503334 -0.885318287
## 87 1.08659598 -0.581099441
## 88 1.18397246 1.456428797
## 89 0.16679916 0.146170848
## 90 0.40465456 1.338171087
## 91 0.39906305 1.091427772
## 92 0.70903141 -0.105852971
## 93 0.46389004 0.957178420
## 94 -0.23098983 2.132630823
## 95 0.39648578 0.809743029
## 96 0.19921519 0.099284236
## 97 0.32879173 0.311682303
## 98 0.67132654 0.078845991
## 99 -0.33398735 1.581218162
## 100 0.36170012 0.536454487
## 101 1.71009852 -0.894052241
## 102 1.17750703 0.665311848
## 103 2.09602938 -0.583674198
## 104 1.45686536 -0.020733757
## 105 1.79071937 -0.342789495
## 106 2.57579797 -0.789255221
## 107 0.38661731 1.624346446
## 108 2.22872233 -0.443217306
## 109 2.02202320 0.728517750
## 110 2.06255070 -1.845013769
## 111 1.33145478 -0.735101605
## 112 1.60565877 0.390424412
## 113 1.80608901 -0.454476108
## 114 1.27553784 1.185649555
## 115 1.37421871 0.420316127
## 116 1.49802971 -0.708627142
## 117 1.47524518 -0.320188330
## 118 2.24416671 -2.405172717
## 119 3.09265796 0.059744071
## 120 1.40319967 1.827045856
## 121 1.90606835 -0.925524073
## 122 0.97032438 0.535629764
## 123 2.73261448 -0.385540928
## 124 1.36076243 0.448269952
## 125 1.63330806 -1.040666715
## 126 1.89192522 -1.032572862
## 127 1.20180371 0.267191352
## 128 1.04492149 -0.129797369
## 129 1.74510746 0.147351199
## 130 1.83436773 -0.600722165
## 131 2.33505041 -0.256590147
## 132 2.13401639 -2.460027089
## 133 1.79784284 0.143881556
## 134 1.18290593 0.226688357
## 135 1.30221896 0.766418300
## 136 2.56490300 -0.818062253
## 137 1.47058012 -1.077939890
## 138 1.35553596 -0.489475118
## 139 0.95130018 -0.081802256
## 140 1.76458306 -0.703363361
## 141 1.87704707 -0.638641032
## 142 1.77095437 -0.701937529
## 143 1.17750703 0.665311848
## 144 1.91770918 -0.891113793
## 145 1.83877014 -1.054184779
## 146 1.75787163 -0.412513140
## 147 1.58947600 0.915408402
## 148 1.48793628 -0.319178298
## 149 1.29529989 -1.025276257
## 150 0.98752038 -0.044117086
# Plot the caret-derived scores. Bind the species labels into the plotting
# data with mutate() rather than reaching into `iris` from inside aes():
# mapping an external vector bypasses the data argument and silently
# misaligns if the rows of iris_PCaret ever stop matching iris row-for-row.
iris_PCaret %>%
  mutate(Species = iris$Species) %>%
  ggplot(aes(x = PC1, y = PC2, color = Species, label = Species)) +
  geom_text(alpha = 0.8, size = 4) +
  ggtitle("First two PC of Iris data") +
  theme_minimal()
# We can see the results are the same as above
Challenge
Perform a PCA on the Ames housing filtered and unfiltered datasets. How much variance is explained by the top components? What is the difference between including and not including the outlier points?
{: .source}
Solution
numericVarsWoutSale <- numericVars[1:length(numericVars) - 1]
table(complete.cases(ameshousingFilt[, numericVarsWoutSale]))
ames_pca <- prcomp(ameshousingFilt[complete.cases(ameshousingFilt[, numericVarsWoutSale]), numericVarsWoutSale], center = TRUE, scale = TRUE)
fviz_screeplot(ames_pca, addlabels = TRUE, ylim = c(0, 25))
fviz_pca_var(ames_pca, col.var = "contrib", gradient.cols = c("red", "blue", "black"), repel = TRUE)
# Contributions of variables to PC1
fviz_contrib(ames_pca, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(ames_pca, choice = "var", axes = 2, top = 10)
fviz_pca_ind(ames_pca, label = "none")
table(complete.cases(ameshousing[, numericVarsWoutSale]))
ames_pca2 <- prcomp(ameshousing[complete.cases(ameshousing[, numericVarsWoutSale]), numericVarsWoutSale], center = TRUE, scale = TRUE)
fviz_screeplot(ames_pca2, addlabels = TRUE, ylim = c(0, 25))
fviz_pca_var(ames_pca2, col.var = "contrib", gradient.cols = c("red", "blue", "black"), repel = TRUE)
# Contributions of variables to PC1
fviz_contrib(ames_pca2, choice = "var", axes = 1, top = 10)
# Contributions of variables to PC2
fviz_contrib(ames_pca2, choice = "var", axes = 2, top = 10)
fviz_pca_ind(ames_pca2, label = "none")
{: .output} {: .solution} {: .challenge}